Loading packages and data ———————————————–

library(dplyr)
library(scater)
library(SeuratDisk)
library(Seurat)
library(ggplot2)
library(RColorBrewer)
library(readr)

Loading in the h5seurat files

real_validation <- LoadH5Seurat("BrainSmall-realData_2kTest.h5seurat") # real data

scgan_validation <- LoadH5Seurat("scGAN_2kTest.h5seurat") # scGAN data

activa_validation <- LoadH5Seurat("20kBrainSmall-1997Generated_10RecLoss_Epoch600.h5seurat") # ACTIVA data

# Changing the barcodes in ACTIVA validation set to be the same as the real validation set
rownames(activa_validation@meta.data) <- rownames(real_validation@meta.data)

Adding metadata ———————————————————

# Generate dataset labels for the metadata
real_labs <- rep("Real (Test Set)", nrow(real_validation@meta.data))
activa_labs <- rep("ACTIVA", nrow(activa_validation@meta.data))
scgan_labs <- rep("scGAN", nrow(scgan_validation@meta.data))


real_validation <- AddMetaData(real_validation,
                               metadata = real_labs,
                               col.name = "dataset_label") # adding dataset label to metadata

scgan_validation <- AddMetaData(scgan_validation,
                                metadata = scgan_labs,
                                col.name = "dataset_label") # adding dataset label to metadata

activa_validation <- AddMetaData(activa_validation,
                                 metadata = activa_labs,
                                 col.name = "dataset_label") # adding dataset label to metadata

Dataset Integration —————————————————–

Variable Gene Selection

# 3000 highly variable genes identified using variance stabilizing transformation - vst
real_validation <- FindVariableFeatures(object = real_validation,
                                        nfeatures = 3000,
                                        selection.method = "vst")

scgan_validation <- FindVariableFeatures(object = scgan_validation,
                                         nfeatures = 3000,
                                         selection.method = "vst")

activa_validation <- FindVariableFeatures(object = activa_validation,
                       nfeatures = 3000,
                       selection.method = "vst")

Integration

val_list <- list(real_validation, scgan_validation, activa_validation)

#Integration
validation.anchors <-
  FindIntegrationAnchors(object.list = val_list,
                         dims = 1:30,
                         anchor.features = 3000)
## Warning in CheckDuplicateCellNames(object.list = object.list): Some cell names
## are duplicated across objects provided. Renaming to enforce unique cell names.
## Computing 3000 integration features
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 4873 anchors
## Filtering anchors
##  Retained 1851 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 4774 anchors
## Filtering anchors
##  Retained 1949 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 5844 anchors
## Filtering anchors
##  Retained 1708 anchors
validation.int <- IntegrateData(anchorset = validation.anchors,
                                dims = 1:30)
## Merging dataset 3 into 1
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Merging dataset 2 into 1 3
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Warning: Adding a command log without an assay associated with it
DefaultAssay(validation.int) <- "integrated"

Scaling, Dimensional reduction, Clustering ——————————

Scaling and PCA

# Scaling
validation.int <- ScaleData(validation.int)
## Centering and scaling data matrix
# Dim reduction w/PCA
validation.int <- RunPCA(validation.int)
## PC_ 1 
## Positive:  Cd24a, Tubb3, Rtn1, Nsg1, Sox11, Mllt11, Tagln3, Sox4, Elavl3, Igfbpl1 
##     6330403K07Rik, Dpysl3, Dcx, Neurod2, Stmn3, Nsg2, Uchl1, Basp1, Neurod6, Map1b 
##     Nfix, Stmn4, Kif5c, Hist3h2ba, Fnbp1l, Cdk5r1, Gria2, Mapt, Thra, Nfib 
## Negative:  Igfbp7, Vamp8, Sparcl1, Slc2a1, Gng11, Sparc, Nfkbia, Ramp2, Slc7a5, Zfp36l1 
##     Ecscr, Itm2a, Ifitm3, AU021092, Anxa3, Esam, Sepp1, Rhoc, Cd34, Hes1 
##     Cyba, Ostf1, Serpinh1, Col4a2, Slc40a1, Col4a1, B2m, Tgfb1, Eng, Arhgap29 
## PC_ 2 
## Positive:  Dbi, Vim, Gng5, Phgdh, Slc1a3, Mfge8, Qk, Ddah1, Pea15a, Ndrg2 
##     Mt3, 1810037I17Rik, Fabp7, Plpp3, Prdx1, Aldoc, Ckb, H2afz, Slc9a3r1, Notch1 
##     Hmgb2, Atp1a2, Prdx6, Asrgl1, Gnai2, H2afv, Mid1ip1, Oat, Psph, Sox9 
## Negative:  Tubb3, Mllt11, Nsg1, Stmn3, Rtn1, Cd24a, Mapt, Stmn2, Stmn4, Elavl3 
##     Dpysl3, Map1b, Dcx, Uchl1, Tmsb10, Sox11, Thra, Basp1, Nsg2, Neurod2 
##     Gria2, Cdk5r1, Ly6h, Neurod6, 6330403K07Rik, Nrep, Sox4, Gng2, Tagln3, Myt1l 
## PC_ 3 
## Positive:  Hmgb2, H2afz, Cdca3, Lockd, Pbk, Ccna2, Racgap1, Birc5, H2afv, Cks2 
##     Smc2, Hmgn2, Cenpf, Smc4, Cdca8, Ube2c, Spc25, Mki67, Top2a, Spc24 
##     Nusap1, Prc1, Tpx2, Cenpa, Cdc20, Cenpe, Ccnb1, Phgdh, Tmpo, Cdkn2c 
## Negative:  Igfbp7, Bsg, Sparc, Cldn5, Col4a2, Gng11, Ramp2, Col4a1, Ecscr, Esam 
##     Slc2a1, Eng, Slc7a5, Itm2a, Slc3a2, Sparcl1, Eva1b, Vwa1, Kdr, Egfl7 
##     Fth1, Phlda1, Serpinh1, Pglyrp1, Ctla2a, Ppic, Tfpi, Sept4, Slc40a1, Myl6 
## PC_ 4 
## Positive:  Ube2s, Spc25, Top2a, Birc5, Ube2c, Cks2, Hmgb2, Hmgn2, Nusap1, Smc2 
##     Cdca8, Pbk, H2afz, Smc4, Cdca3, Ccna2, Cdk1, Cks1b, Tmpo, Prc1 
##     H2afx, Tpx2, Ccnb1, Cenpe, Mki67, H2afv, Racgap1, Lockd, Cenpf, Cenpa 
## Negative:  Aldoc, Ptprz1, Cspg5, Fabp7, Mt3, Slc1a3, Dbi, Ttyh1, Ndrg2, Hmgcs1 
##     Plpp3, Tnc, Hes5, Ccdc80, Pdpn, Mid1ip1, Mmd2, Luzp2, Fjx1, Pea15a 
##     Scd2, Tst, Cd9, Tspan7, Ddah1, Mlc1, Fgfbp3, Glud1, Vcam1, Qk 
## PC_ 5 
## Positive:  Meg3, Nrxn3, Tubb2a, Dlx2, Gng2, Stmn2, Arx, Dlx6os1, Dlx1, Thra 
##     Syt1, Ly6h, Ina, Dlx5, Sp9, Mef2c, Tmsb10, Gad2, Gap43, Map1b 
##     Xist, Stmn3, Gm13889, Mapt, Maf, Pcsk1n, Rpp25, Gad1, Gria1, Nxph1 
## Negative:  Igfbpl1, Mdk, Ppp2r2b, Lhx2, Eomes, Pantr1, Tsc22d1, Nfix, Plekhf2, Unc5d 
##     Nfib, Pou3f2, Ezr, Itm2b, Mfap4, Rnd2, Neurod2, Sox11, Igsf8, Cdk2ap1 
##     Pam, Sstr2, Tagln3, Ddah2, Mllt3, Dhrs4, Ttc28, Plcb1, Rasgef1b, Neurod6
ElbowPlot(validation.int, 
          ndims = 40)

Jackstraw plot

# Jackstraw plot
# Takes a bit of time to run
validation.int <- JackStraw(validation.int,
                            dims = 50)
validation.int <- ScoreJackStraw(validation.int, 
                                 dims = 1:50)
JackStrawPlot(validation.int, 
              dims = 1:50)

Clustering

# Clustering
usepcs_int <- 1:50
validation.int <- FindNeighbors(object = validation.int,
                                dims = usepcs_int)
## Computing nearest neighbor graph
## Computing SNN
validation.int <- FindClusters(object = validation.int,
                               resolution = seq(0.10, 2, 0.10))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9283
## Number of communities: 5
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8849
## Number of communities: 6
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8630
## Number of communities: 8
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8466
## Number of communities: 11
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8328
## Number of communities: 12
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8193
## Number of communities: 13
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8062
## Number of communities: 13
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7937
## Number of communities: 14
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7839
## Number of communities: 15
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7745
## Number of communities: 17
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7652
## Number of communities: 18
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7567
## Number of communities: 17
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7488
## Number of communities: 18
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7414
## Number of communities: 21
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7342
## Number of communities: 21
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7267
## Number of communities: 20
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7198
## Number of communities: 21
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7136
## Number of communities: 22
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7082
## Number of communities: 23
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5991
## Number of edges: 287554
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7028
## Number of communities: 26
## Elapsed time: 0 seconds
grep("res", colnames(validation.int@meta.data), value = TRUE) %>%
  purrr::map_chr(~ paste(.x, "--> clusters generated:", length(unique(
    validation.int@meta.data[, .x]))))
##  [1] "integrated_snn_res.0.1 --> clusters generated: 5" 
##  [2] "integrated_snn_res.0.2 --> clusters generated: 6" 
##  [3] "integrated_snn_res.0.3 --> clusters generated: 8" 
##  [4] "integrated_snn_res.0.4 --> clusters generated: 11"
##  [5] "integrated_snn_res.0.5 --> clusters generated: 12"
##  [6] "integrated_snn_res.0.6 --> clusters generated: 13"
##  [7] "integrated_snn_res.0.7 --> clusters generated: 13"
##  [8] "integrated_snn_res.0.8 --> clusters generated: 14"
##  [9] "integrated_snn_res.0.9 --> clusters generated: 15"
## [10] "integrated_snn_res.1 --> clusters generated: 17"  
## [11] "integrated_snn_res.1.1 --> clusters generated: 18"
## [12] "integrated_snn_res.1.2 --> clusters generated: 17"
## [13] "integrated_snn_res.1.3 --> clusters generated: 18"
## [14] "integrated_snn_res.1.4 --> clusters generated: 21"
## [15] "integrated_snn_res.1.5 --> clusters generated: 21"
## [16] "integrated_snn_res.1.6 --> clusters generated: 20"
## [17] "integrated_snn_res.1.7 --> clusters generated: 21"
## [18] "integrated_snn_res.1.8 --> clusters generated: 22"
## [19] "integrated_snn_res.1.9 --> clusters generated: 23"
## [20] "integrated_snn_res.2 --> clusters generated: 26"
Idents(validation.int) <- "integrated_snn_res.0.3"

Dimension reduction

# Dimension reduction
validation.int <- RunTSNE(object = validation.int, 
                          reduction.use = "pca",
                          dims = usepcs_int,
                          do.fast = TRUE)

validation.int <- RunUMAP(validation.int,
                          reduction = "pca",
                          dims = usepcs_int)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 11:04:51 UMAP embedding parameters a = 0.9922 b = 1.112
## 11:04:51 Read 5991 rows and found 50 numeric columns
## 11:04:51 Using Annoy for neighbor search, n_neighbors = 30
## 11:04:51 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 11:04:52 Writing NN index file to temp file /var/folders/r9/z4byyk895zbc4qs07n05cph80000gn/T//Rtmpgf5kfA/filea895436966b8
## 11:04:52 Searching Annoy index using 1 thread, search_k = 3000
## 11:04:53 Annoy recall = 100%
## 11:04:54 Commencing smooth kNN distance calibration using 1 thread
## 11:04:55 Initializing from normalized Laplacian + noise
## 11:04:55 Commencing optimization for 500 epochs, with 273440 positive edges
## 11:05:04 Optimization finished

Saving object

# Saving the datasets in different formats for interoperability with python packages
save(validation.int, file = "data/final_brainsmall_val_int_clust.RData")
SaveH5Seurat(validation.int,
             filename = "final_brainsmall_val_int_clust.h5Seurat",
             overwrite = FALSE)
Convert("final_brainsmall_val_int_clust.h5Seurat", dest = "h5ad")

Visualization with scater ———————————————–

Prep for plotting

# scater is a toolkit like Seurat but uses a different data structure (Single Cell Experiment (SCE))

# First will use seurat to generate the SCE data by calling the as.SingleCellExperiment() on our Seurat object

# Removing negative expression values before converting to SCE object
cleaned_Hmgb2 <- WhichCells(validation.int, 
                            expression = Hmgb2 >= 0)
validation.int <- subset(validation.int, cells = cleaned_Hmgb2)

# Converting the seurat object into a SCE object
val_int_sce <- as.SingleCellExperiment(validation.int)
names(assays(val_int_sce)) <- "counts" # making sure assay name is correct for expression values

# Creating an object "logcounts" which houses log2 transformed count values
logcounts(val_int_sce) <- log2(counts(val_int_sce)+1)
## Warning in eval(call, parent.frame()): NaNs produced
dim(logcounts(val_int_sce))
## [1] 3000 5953

Plotting w/scater

p_clusters <-
  plotUMAP(val_int_sce, colour_by = "ident", other_fields = "dataset_label") +
  facet_wrap( ~ dataset_label) +
  theme(legend.title = element_blank(),
        strip.text.x = element_text(face = "bold"))
p_clusters
Clusters

Clusters

ggsave(
  filename = "plots/integrated/brainsmall_final_validation_scater_umap.png",
  dpi = 320,
  units = "in",
  width = 8,
  height = 4
)
p_Hmgb2 <- plotExpression(
  val_int_sce,
  features = c("Hmgb2"),
  colour_by = "ident",
  x = "ident",
  other_fields = "dataset_label",
  exprs_values = "logcounts"
) +
  facet_wrap( ~ dataset_label) +
  xlab("Cluster") +
  ggtitle("Marker: Hmgb2") +
  theme(legend.title = element_blank(),
        strip.text.x = element_text(face = "bold"))
p_Hmgb2
Violin Plots

Violin Plots

ggsave(
  filename = "plots/integrated/brainsmall_final_validation_scater_violin_Hmgb2_logcounts.png",
  dpi = 320,
  units = "in",
  width = 8,
  height = 4
)
p_density_Hmgb2 <-
  plotUMAP(val_int_sce, colour_by = "Hmgb2", other_fields = "dataset_label") +
  facet_wrap( ~ dataset_label) +
  scale_fill_viridis_c(option = "B") +
  ggtitle("Marker: Hmgb2") +
  theme(legend.title = element_blank(),
        strip.text.x = element_text(face = "bold"))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
p_density_Hmgb2
Expression Plot

Expression Plot

ggsave(
  filename = "plots/integrated/brainsmall_final_validation_scater_UMAPexpression_Hmgb2.png",
  dpi = 320,
  units = "in",
  width = 8,
  height = 4
)